We import some standard Python data analysis packages and authenticate the user for access to Google Cloud BigQuery.
# Standard Python data-analysis imports plus Colab/BigQuery clients.
from __future__ import print_function
from google.colab import auth
from google.cloud import bigquery
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# Interactively prompts the notebook user for Google credentials.
auth.authenticate_user()
Next we will need to enter some information on how to access the data.
analysis_project is the project used for processing the queries.
#@title Fill out this form then press [shift ⇧]+[enter ⏎] {run: "auto"}
import subprocess
import re

analysis_project = 'physionet-data-275415' #@param {type:"string"}
admissions_table = 'physionet-data.mimiciii_clinical.admissions' # @param {type: "string"}
d_icd_diagnoses_table = 'physionet-data.mimiciii_clinical.d_icd_diagnoses' # @param {type: "string"}
diagnoses_icd_table = 'physionet-data.mimiciii_clinical.diagnoses_icd' # @param {type: "string"}
patients_table = 'physionet-data.mimiciii_clinical.patients' # @param {type: "string"}
labs_event = 'physionet-data.mimiciii_clinical.labevents' # @param {type: "string"}
labs_items = 'physionet-data.mimiciii_clinical.d_labitems' # @param {type: "string"}

# Placeholders substituted into every query issued via the %%bigquery
# magic (see the preprocessing injection below).
sub_dict = dict(
    analysis_project=analysis_project,
    admissions_table=admissions_table,
    d_icd_diagnoses_table=d_icd_diagnoses_table,
    diagnoses_icd_table=diagnoses_icd_table,
    patients_table=patients_table,
    ml_table_prefix=analysis_project + '.MIMIC.models_',
    labs_event=labs_event,
    labs_items=labs_items,
)
# Set the default project for running queries.
bigquery.magics.context.project = analysis_project

# Set up the substitution preprocessing injection.
# Guard against re-running this cell: capture the pristine _run_query
# only the first time, otherwise we would wrap our own wrapper and
# apply the substitutions twice. (The author's original guard used the
# Python 2 attribute `func_name` and was left commented out.)
if getattr(bigquery.magics._run_query, '__name__', '') != 'format_and_run_query':
    original_run_query = bigquery.magics._run_query

def format_and_run_query(client, query, job_config=None):
    """Fill the query's {placeholders} from sub_dict, then run it."""
    query = query.format(**sub_dict)
    return original_run_query(client, query, job_config)

bigquery.magics._run_query = format_and_run_query
print('analysis_project:', analysis_project)
print()
print('custom %%bigquery magic substitutions:')
for k, v in sub_dict.items():
print(' ', '{%s}' % k, '→', v)
%config InlineBackend.figure_format = 'svg'
bq = bigquery.Client(project=analysis_project)
Create a dataset named MIMIC if it does not already exist.
# Create the MIMIC dataset only if it is missing; bq.create_dataset
# would raise google.api_core.exceptions.Conflict on a duplicate,
# which the membership check avoids.
existing_ids = {ds.dataset_id for ds in bq.list_datasets()}
if 'MIMIC' not in existing_ids:
    dataset_id = "{}.MIMIC".format(bq.project)
    # Build a full Dataset object and send it to the API for creation.
    dataset = bq.create_dataset(bigquery.Dataset(dataset_id))
In the intensive care unit, are admissions with more lab events more or less likely to be fatal?
%%bigquery hist_df
-- Histogram of admissions by lab-event count: for each admission
-- count its lab events, then count admissions per n_labs value.
SELECT
n_labs, COUNT(*) AS cnt
FROM (
SELECT
HADM_ID,COUNT(*) AS n_labs
FROM
`{labs_event}`
-- lab events recorded outside a hospital admission are excluded
WHERE HADM_ID IS NOT NULL
GROUP BY
HADM_ID
)
GROUP BY n_labs
ORDER BY n_labs
# Bar chart of the lab-count histogram over the full 0-1000 range.
plt.figure(figsize=(15, 4))
ax = sns.barplot(
    x=hist_df.n_labs, y=hist_df.cnt, color=sns.color_palette()[0])
# Too many categorical ticks to read: keep tick 0 and every 30th
# label (offset 4); hide the rest.
for idx, tick_label in enumerate(ax.get_xticklabels()):
    if idx != 0 and idx % 30 != 4:
        tick_label.set_visible(False)
plt.xticks(rotation=90)
plt.xlabel('Number of labs')
plt.ylabel('Count')
plt.xlim(0, 1000)
Zoom in
# Same histogram, zoomed in to the 0-650 range.
plt.figure(figsize=(15, 4))
ax = sns.barplot(
    x=hist_df.n_labs, y=hist_df.cnt, color=sns.color_palette()[0])
# Keep tick 0 and every 30th label (offset 4); hide the rest.
for idx, tick_label in enumerate(ax.get_xticklabels()):
    if idx != 0 and idx % 30 != 4:
        tick_label.set_visible(False)
plt.xticks(rotation=90)
plt.xlabel('Number of labs')
plt.ylabel('Count')
plt.xlim(0, 650)
The mathematical explanation of this is called the central limit theorem. While this is by no means a deal breaker, the thin tails we see in the distribution can be a challenge for linear-regression models. This is because the extreme points tend to affect the likelihood the most, so having fewer of them makes your model more sensitive to outliers. Regularization can help with this, but if it becomes too much of a problem we can consider a different type of model (such as support-vector machines, or robust regression) instead of generalized linear regression.
%%bigquery
# BigQuery ML create model statement:
# logistic regression of in-hospital mortality on a single feature,
# the number of lab events per admission.
CREATE OR REPLACE MODEL `{ml_table_prefix}mortality_models_according_labs_events`
OPTIONS(
# Use logistic_reg for discrete predictions (classification) and linear_reg
# for continuous predictions (forecasting).
model_type = 'logistic_reg',
# See the below aside (𝜎 = 0.5 ⇒ 𝜆 = 2)
l2_reg = 2,
# Identify the column to use as the label (dependent variable)
input_label_cols = ["died"]
)
AS
# standard SQL query to train the model with:
SELECT
COUNT(*) AS event_labs,
# an admission counts as a death if its HOSPITAL_EXPIRE_FLAG is set
MAX(HOSPITAL_EXPIRE_FLAG) as died
FROM
`{admissions_table}`
INNER JOIN `{labs_event}`
USING (HADM_ID)
GROUP BY HADM_ID
%%bigquery scatter_df
# Same per-admission aggregation as the training query, fetched into
# a dataframe for plotting.
SELECT
COUNT(*) AS event_labs,
MAX(HOSPITAL_EXPIRE_FLAG) as died
FROM
`{admissions_table}`
INNER JOIN `{labs_event}`
USING (HADM_ID)
GROUP BY HADM_ID
This is an example of the training table.
# Display a sample of the training table.
scatter_df
It's time to check the model weights.
%%bigquery event_labs_model_weights
# Learned weights of the single-feature model.
SELECT * FROM ML.WEIGHTS(MODEL `{ml_table_prefix}mortality_models_according_labs_events`)
# Display the weights table.
event_labs_model_weights
We see that event_labs has a small weight, and it is not enough for a good prediction.
# Parameter passed to the next %%bigquery cell via --params:
# predict over the full observed range of lab counts.
params = {'max_prediction': hist_df.n_labs.max()}
%%bigquery line_df --params $params
# Predicted death probability for every lab count from 1 to the
# observed maximum. The model trained above is named
# mortality_models_according_labs_events; the original cell referenced
# a model called complexity_mortality_models that is never created.
SELECT * FROM
ML.PREDICT(MODEL `{ml_table_prefix}mortality_models_according_labs_events`, (
SELECT * FROM
UNNEST(GENERATE_ARRAY(1, @max_prediction)) AS event_labs
))
This is the table of predicted probabilities.
# Display the predicted probability for each event_labs value.
line_df
plt.figure(figsize=(10, 5))
# Empirical death rate, binned per lab count.
sns.regplot(
    x='event_labs',
    y='died',
    data=scatter_df,
    fit_reg=False,
    x_bins=np.arange(1, scatter_df.event_labs.max() + 1),
)
# Overlay the model's predicted probability curve.
predicted_prob = line_df.predicted_died_probs.apply(lambda probs: probs[0]['prob'])
plt.plot(line_df.event_labs, predicted_prob)
plt.xlabel('Number of labs')
plt.ylabel('Probability of death during admission')
plt.xticks(rotation=90)
plt.xlim(0, 4000)
Zoom in
# Same plot, zoomed in to the 0-500 lab-count range.
plt.figure(figsize=(10, 5))
sns.regplot(
    x='event_labs',
    y='died',
    data=scatter_df,
    fit_reg=False,
    x_bins=np.arange(1, scatter_df.event_labs.max() + 1),
)
predicted_prob = line_df.predicted_died_probs.apply(lambda probs: probs[0]['prob'])
plt.plot(line_df.event_labs, predicted_prob)
plt.xlabel('Number of labs')
plt.ylabel('Probability of death during admission')
plt.xticks(rotation=90)
plt.xlim(0, 500)
Based on the graph, I believe that this model cannot provide us with good prediction accuracy. Therefore, I conclude that there is no relationship between the number of labs and the probability of death. Nevertheless, we will try to improve it
%%bigquery DF_TOLEARNING
# Per-admission training table: demographics and admission attributes
# joined with the lab-event count and the in-hospital mortality label.
SELECT
# Age at admission in years; values of 200+ are clamped to 95
# (presumably date-shifted elderly patients - TODO confirm against
# the MIMIC-III documentation).
IF(DATETIME_DIFF(ADMITTIME, DOB, DAY)/365.25 < 200, DATETIME_DIFF(ADMITTIME, DOB, DAY)/365.25, 95) AS AGE,
MARITAL_STATUS,
GENDER,
ADMISSION_TYPE,
ADMISSION_LOCATION,
INSURANCE,
ETHNICITY,
num_labs as NUM_LABS,
died
FROM
# One row per admission: lab count, label, and ANY_VALUE-collapsed
# admission attributes (constant within a HADM_ID group).
(SELECT
COUNT(*) AS num_labs,
MAX(HOSPITAL_EXPIRE_FLAG) as died,
ANY_VALUE(ADMITTIME) as ADMITTIME,
ANY_VALUE(MARITAL_STATUS) as MARITAL_STATUS,
ANY_VALUE(ADMISSION_TYPE) as ADMISSION_TYPE,
ANY_VALUE(ADMISSION_LOCATION) as ADMISSION_LOCATION,
ANY_VALUE(INSURANCE) as INSURANCE,
ANY_VALUE(ETHNICITY) as ETHNICITY,
SUBJECT_ID
FROM
`{admissions_table}` AS adm
JOIN `{labs_event}` AS labs
USING (HADM_ID, SUBJECT_ID)
GROUP BY HADM_ID, SUBJECT_ID
)
JOIN `{patients_table}` AS patients
USING (SUBJECT_ID)
Example table for train
# Display the training table sample.
DF_TOLEARNING
Time to train our model
%%bigquery
# Train a logistic regression on demographics plus the lab-event
# count (the same feature query as DF_TOLEARNING above).
CREATE OR REPLACE MODEL `{ml_table_prefix}complexity_feature_mortality`
OPTIONS(model_type='logistic_reg', l2_reg=2, input_label_cols=["died"])
AS
SELECT
# Age in years, clamped to 95 for implausible (200+) values.
IF(DATETIME_DIFF(ADMITTIME, DOB, DAY)/365.25 < 200, DATETIME_DIFF(ADMITTIME, DOB, DAY)/365.25, 95) AS age,
MARITAL_STATUS,
GENDER,
ADMISSION_TYPE,
ADMISSION_LOCATION,
INSURANCE,
ETHNICITY,
num_labs,
died
FROM
(SELECT
COUNT(*) AS num_labs,
MAX(HOSPITAL_EXPIRE_FLAG) as died,
ANY_VALUE(ADMITTIME) as ADMITTIME,
ANY_VALUE(MARITAL_STATUS) as MARITAL_STATUS,
ANY_VALUE(ADMISSION_TYPE) as ADMISSION_TYPE,
ANY_VALUE(ADMISSION_LOCATION) as ADMISSION_LOCATION,
ANY_VALUE(INSURANCE) as INSURANCE,
ANY_VALUE(ETHNICITY) as ETHNICITY,
SUBJECT_ID
FROM
`{admissions_table}` AS adm
JOIN `{labs_event}` AS labs
USING (HADM_ID, SUBJECT_ID)
GROUP BY HADM_ID, SUBJECT_ID
)
JOIN `{patients_table}` AS patients
USING (SUBJECT_ID)
%%bigquery weight_all
# Learned weights of the feature-augmented model, largest first.
SELECT * FROM ML.WEIGHTS(MODEL `{ml_table_prefix}complexity_feature_mortality`)
ORDER BY weight DESC
Table with model weights
# Display the weights table.
weight_all
We see that num_labs still has a small weight. Age has a bigger weight, but it seems that it is still not big enough. Next we unpack the weights inside the categorical variables.
# Print the per-category weights of each categorical input.
# ML.WEIGHTS returns one row per input; categorical inputs carry their
# weights in the repeated `category_weights` field (checked here via
# len > 1). The original code walked a separate `title` list with a
# manual counter that the `continue` branch prevented from being
# incremented, so titles could drift out of sync with the weights they
# labeled; pairing the two columns row by row avoids that.
for _, row in weight_all.iterrows():
    categories = row['category_weights']
    if len(categories) > 1:
        print(row['processed_input'])
        for cw in categories:
            print(cw['category'], cw['weight'])
        print("____________\n")
We see that the weights of the new variables are greater than the weight of the number of laboratory tests. This means that the new variables have more influence on the accuracy of the prediction.
def set_precision(df):
    """Add a `precision` column (TP / (TP + FP)) to an ML.ROC_CURVE frame, in place."""
    tp = df.true_positives
    fp = df.false_positives
    df['precision'] = tp / (tp + fp)
def plot_precision_recall(df, label=None):
    """Draw one precision-recall line from an ML.ROC_CURVE frame."""
    # Rows with zero true positives have undefined precision; drop them,
    # then manually prepend the threshold = -inf point (recall 0,
    # precision 1) so the curve starts at the top-left corner.
    df = df[df.true_positives != 0]
    recall_axis = [0] + list(df.recall)
    precision_axis = [1] + list(df.precision)
    plt.plot(recall_axis, precision_axis, label=label)
%%bigquery model1
# ROC curve of the labs-count-only model.
SELECT * FROM ML.ROC_CURVE(MODEL `{ml_table_prefix}mortality_models_according_labs_events`)
%%bigquery model2
# ROC curve of the demographics + labs-count model.
SELECT * FROM ML.ROC_CURVE(MODEL `{ml_table_prefix}complexity_feature_mortality`)
# Evaluate both models in a single UNION ALL query and label the rows.
model_names = ['mortality_models_according_labs_events',
               'complexity_feature_mortality']
eval_queries = list()
for m in model_names:
    eval_queries.append(
        'SELECT * FROM ML.EVALUATE('
        'MODEL `{ml_table_prefix}{m}`)'.format(m=m, **sub_dict))
eval_query = '\nUNION ALL\n'.join(eval_queries)
d = bq.query(eval_query).result().to_dataframe()
d.index = ['models_according_labs_events',
           'models_according_labs_events+features']
d
# Precision-recall comparison of the two models.
for curve_df, curve_label in ((model1, 'only_labs'), (model2, 'all_features')):
    set_precision(curve_df)
    plot_precision_recall(curve_df, label=curve_label)
plt.legend()
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.xlabel(r'Recall $\left(\frac{T_p}{T_p + F_n} \right)$')
plt.ylabel(r'Precision $\left(\frac{T_p}{T_p + F_p} \right)$')
From the ROC curves we see that adding variables describing the patient and their admission improves the model.
Of course, neither of these models is very good when it comes to making predictions. For our last set of models, we'll try more earnestly to predict patient mortality.
Where 𝑚∈ {8,16,32,64,128,256,512}
Firstly, make up the top lab events
%%bigquery top_labs
# Most frequent lab items overall, with their human-readable labels.
SELECT ITEMID,COUNT,LABEL
FROM
(SELECT ITEMID,COUNT(*) AS COUNT
FROM `{labs_event}`
GROUP BY (ITEMID))
JOIN `{labs_items}`
USING(ITEMID)
ORDER BY COUNT DESC
# Display the ranked lab items.
top_labs
Example table for train
%%bigquery
# Preview (LIMIT 100) of the feature table used for the top-m models:
# one-hot indicators for 16 frequent lab ITEMIDs plus demographics.
WITH labs AS (
SELECT
HADM_ID,
COUNT(*) AS COUNT_LABS,
# 1.0 if the admission had at least one event for this lab item.
MAX(IF(ITEMID = 51221, 1.0, 0.0)) as `lab_51221`,
MAX(IF(ITEMID = 50971, 1.0, 0.0)) as `lab_50971`,
MAX(IF(ITEMID = 50983, 1.0, 0.0)) as `lab_50983`,
MAX(IF(ITEMID = 50912, 1.0, 0.0)) as `lab_50912`,
MAX(IF(ITEMID = 50902, 1.0, 0.0)) as `lab_50902`,
MAX(IF(ITEMID = 51006, 1.0, 0.0)) as `lab_51006`,
MAX(IF(ITEMID = 50882, 1.0, 0.0)) as `lab_50882`,
MAX(IF(ITEMID = 51265, 1.0, 0.0)) as `lab_51265`,
MAX(IF(ITEMID = 50868, 1.0, 0.0)) as `lab_50868`,
MAX(IF(ITEMID = 51301, 1.0, 0.0)) as `lab_51301`,
MAX(IF(ITEMID = 51222, 1.0, 0.0)) as `lab_51222`,
MAX(IF(ITEMID = 50931, 1.0, 0.0)) as `lab_50931`,
MAX(IF(ITEMID = 51249, 1.0, 0.0)) as `lab_51249`,
MAX(IF(ITEMID = 51279, 1.0, 0.0)) as `lab_51279`,
MAX(IF(ITEMID = 51248, 1.0, 0.0)) as `lab_51248`,
MAX(IF(ITEMID = 51250, 1.0, 0.0)) as `lab_51250`
FROM `physionet-data.mimiciii_clinical.labevents`
GROUP BY HADM_ID
)
SELECT
* EXCEPT (SUBJECT_ID,HADM_ID,ITEMID,ADMITTIME,DOB),
# Age in years, clamped to 95 for implausible (200+) values.
IF(DATETIME_DIFF(ADMITTIME, DOB, DAY)/365.25 < 200,
DATETIME_DIFF(ADMITTIME, DOB, DAY)/365.25, 95) AS age,
FROM
# NOTE(review): table names are hard-coded here rather than using the
# magic substitutions used elsewhere in the notebook.
(SELECT SUBJECT_ID,HADM_ID,ITEMID FROM `physionet-data.mimiciii_clinical.labevents`)
JOIN
(SELECT SUBJECT_ID,HADM_ID,ADMISSION_TYPE,ADMISSION_LOCATION,INSURANCE,MARITAL_STATUS,ETHNICITY ,ADMITTIME,HOSPITAL_EXPIRE_FLAG AS died FROM `physionet-data.mimiciii_clinical.admissions`)
USING (SUBJECT_ID,HADM_ID)
JOIN
(SELECT DOB,GENDER,SUBJECT_ID FROM`physionet-data.mimiciii_clinical.patients`)
USING (SUBJECT_ID)
JOIN labs
USING (HADM_ID)
limit 100
Time to train model
This time around we're using l1_reg instead of l2_reg because we expect that some of our many variables will not significantly impact the outcome, and we would prefer a sparse model if possible.
# Train one logistic model per cutoff m, one-hot encoding the top-m
# most frequent lab items. l1_reg encourages a sparse model.
top_m_labs = (8, 16, 32, 64, 128, 256, 512)
query_jobs = list()
for m in top_m_labs:
labs_columns = list()
# Build a one-hot SELECT expression for each of the top-m ITEMIDs.
for _, row in top_labs.iloc[:m].iterrows():
labs_columns.append('MAX(IF(ITEMID = {0}, 1.0, 0.0))'
' as `lab_{0}`'.format(row.ITEMID))
query = """
CREATE OR REPLACE MODEL `{ml_table_prefix}mortality_feature_top_labs_{m}`
OPTIONS(model_type='logistic_reg', l1_reg=2, input_label_cols=["died"])
AS
WITH labs AS (
SELECT
HADM_ID,
COUNT(*) AS COUNT_LABS,
{labs_columns}
FROM `{labs_event}`
GROUP BY HADM_ID
)
SELECT
* EXCEPT (SUBJECT_ID,HADM_ID,ITEMID,ADMITTIME,DOB),
IF(DATETIME_DIFF(ADMITTIME, DOB, DAY)/365.25 < 200,
DATETIME_DIFF(ADMITTIME, DOB, DAY)/365.25, 95) AS age,
FROM
(SELECT SUBJECT_ID,HADM_ID,ITEMID FROM `{labs_event}`)
JOIN
(SELECT SUBJECT_ID,HADM_ID,ADMISSION_TYPE,ADMISSION_LOCATION,INSURANCE,MARITAL_STATUS,ETHNICITY ,ADMITTIME,HOSPITAL_EXPIRE_FLAG AS died FROM `{admissions_table}`)
USING (SUBJECT_ID,HADM_ID)
JOIN
(SELECT DOB,GENDER,SUBJECT_ID FROM`{patients_table}`)
USING (SUBJECT_ID)
JOIN labs
USING (HADM_ID)
""".format(m=m,labs_columns=',\n '.join(labs_columns),**sub_dict)
query_jobs.append(bq.query(query))
# Block until every training job finishes; exception() waits and
# returns (rather than raises) any job error.
for j in query_jobs:
j.exception()
Time to plot ROC curve
# Evaluate every top-m model in one UNION ALL query.
eval_queries = [
    'SELECT * FROM ML.EVALUATE('
    'MODEL `{ml_table_prefix}mortality_feature_top_labs_{}`)'.format(m, **sub_dict)
    for m in top_m_labs
]
eval_query = '\nUNION ALL\n'.join(eval_queries)
bq.query(eval_query).result().to_dataframe()
Plotting ROC and precision-recall curves
# One precision-recall curve per top-m model.
for m in top_m_labs:
    roc_query = ('SELECT * FROM ML.ROC_CURVE('
                 'MODEL `{ml_table_prefix}mortality_feature_top_labs_{}`)'
                 ).format(m, **sub_dict)
    df = bq.query(roc_query).result().to_dataframe()
    set_precision(df)
    plot_precision_recall(df, label='{} labs'.format(m))
plt.legend()
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.xlabel(r'Recall $\left(\frac{T_p}{T_p + F_n} \right)$')
plt.ylabel(r'Precision $\left(\frac{T_p}{T_p + F_p} \right)$')
The model with m = 512 seems to be overfitting the data, while somewhere between m = 128 and m = 256 seems to be the sweet spot for model flexibility.
Actually, the predictive power of our model¹ isn't nearly as interesting as its weights and what they tell us. In the next section, we'll dig into them.
Let's have a look at the weights from the m = 128 model.
%%bigquery weights_128
# Learned weights of the m = 128 model, largest first.
SELECT * FROM ML.WEIGHTS(MODEL `{ml_table_prefix}mortality_feature_top_labs_128`)
ORDER BY weight DESC
# Display the weights table.
weights_128
First we'll look at the weights for the numerical inputs.
pd.options.display.max_rows = 500

def _to_itemid(processed_input):
    # 'lab_<ITEMID>' columns map back to an integer ITEMID so they can
    # be joined against top_labs; other inputs are left unchanged.
    if processed_input.startswith('lab_'):
        return int(processed_input.split('_')[1])
    return processed_input

weights_128['ITEMID'] = weights_128.processed_input.apply(_to_itemid)
weights_128
# Attach human-readable lab labels and keep only numeric-weight rows.
view_df = weights_128.merge(top_labs, how='left', on='ITEMID')
view_df = view_df[~pd.isnull(view_df.weight)]
view_df[['processed_input', 'LABEL', 'weight', 'COUNT']]
We now have a list of lab events, sorted from most fatal to least fatal according to our model.
Going back to our original question, we can see that the weight for COUNT_LABS (a.k.a the number of lab events) has essentially gone to zero (0.000007). The average lab events weight is also very small:
# Mean weight over the lab-event features (rows that matched a LABEL).
view_df[~pd.isnull(view_df.LABEL)].weight.mean()
So we can conclude that given that a patient has been admitted to the ICU, the number of lab events they've been given does not predict their outcome beyond the linear effect of the component diagnoses.
Let's try to improve our model by adding a diagnosis
Where 𝑚∈ {8,16,32,64,128,256,512}
Top 256 diagnoses
%%bigquery top_diagnoses
# Most frequent ICD-9 diagnoses with their short titles.
WITH top_diag AS (
SELECT COUNT(*) AS count, ICD9_CODE FROM `{diagnoses_icd_table}`
GROUP BY ICD9_CODE
)
SELECT top_diag.ICD9_CODE, icd_lookup.SHORT_TITLE, top_diag.count FROM
top_diag JOIN
`{d_icd_diagnoses_table}` AS icd_lookup
USING (ICD9_CODE)
ORDER BY count DESC
# Display the first rows of the ranked diagnoses.
top_diagnoses.head()
Compiling the example training table takes a long time, so the query below is left commented out.
# top_m_labs = 128
# query_jobs = list()
# top_n_diagnoses = 256
# diagnosis_columns = list()
# labs_columns = list()
# for _, row in top_labs.iloc[:top_m_labs].iterrows():
# labs_columns.append('MAX(IF(ITEMID = {0}, 1.0, 0.0))'
# ' as `lab_{0}`'.format(row.ITEMID))
# for _, row in top_diagnoses.iloc[:top_n_diagnoses].iterrows():
# diagnosis_columns.append('MAX(IF(ICD9_CODE = "{0}", 1.0, 0.0))'
# ' as `icd9_{0}`'.format(row.ICD9_CODE))
# query = """
# WITH labs AS (
# SELECT
# HADM_ID,
# COUNT(*) AS COUNT_LABS,
# {labs_columns}
# FROM `{labs_event}`
# GROUP BY HADM_ID
# )
# SELECT * EXCEPT (HADM_ID) FROM
# (SELECT
# * EXCEPT (SUBJECT_ID,ITEMID,ADMITTIME,DOB),
# IF(DATETIME_DIFF(ADMITTIME, DOB, DAY)/365.25 < 200,
# DATETIME_DIFF(ADMITTIME, DOB, DAY)/365.25, 95) AS age,
# FROM
# (SELECT SUBJECT_ID,HADM_ID,ITEMID FROM `{labs_event}`)
# JOIN
# (SELECT SUBJECT_ID,HADM_ID,ADMISSION_TYPE,ADMISSION_LOCATION,INSURANCE,MARITAL_STATUS,ETHNICITY ,ADMITTIME,HOSPITAL_EXPIRE_FLAG AS died FROM `{admissions_table}`)
# USING (SUBJECT_ID,HADM_ID)
# JOIN
# (SELECT DOB,GENDER,SUBJECT_ID FROM`{patients_table}`)
# USING (SUBJECT_ID)
# JOIN labs
# USING (HADM_ID)
# )
# JOIN
# (SELECT
# HADM_ID,
# COUNT(*) AS num_diag,
# {diag_cols}
# FROM `{diagnoses_icd_table}`
# WHERE ICD9_CODE IS NOT NULL
# GROUP BY HADM_ID)
# USING (HADM_ID)
# """.format(m=m,labs_columns=',\n '.join(labs_columns),diag_cols=',\n '.join(diagnosis_columns),**sub_dict)
# ex.result().to_dataframe().head()
Let's go straight to training the model.
# Train the combined model: top-128 lab indicators, top-256 diagnosis
# indicators, and demographics.
top_m_labs = 128
query_jobs = list()
top_n_diagnoses = 256
diagnosis_columns = list()
labs_columns = list()
# One-hot SELECT expression for each of the top-m lab ITEMIDs.
for _, row in top_labs.iloc[:top_m_labs].iterrows():
labs_columns.append('MAX(IF(ITEMID = {0}, 1.0, 0.0))'
' as `lab_{0}`'.format(row.ITEMID))
# One-hot SELECT expression for each of the top-n ICD-9 codes
# (codes are strings, hence the quoting).
for _, row in top_diagnoses.iloc[:top_n_diagnoses].iterrows():
diagnosis_columns.append('MAX(IF(ICD9_CODE = "{0}", 1.0, 0.0))'
' as `icd9_{0}`'.format(row.ICD9_CODE))
query = """
CREATE OR REPLACE MODEL `{ml_table_prefix}predict_mortality_diag_labs`
OPTIONS(model_type='logistic_reg', l1_reg=2, input_label_cols=["died"])
AS
WITH labs AS (
SELECT
HADM_ID,
COUNT(*) AS COUNT_LABS,
{labs_columns}
FROM `{labs_event}`
GROUP BY HADM_ID
)
SELECT * EXCEPT (HADM_ID) FROM
(SELECT
* EXCEPT (SUBJECT_ID,ITEMID,ADMITTIME,DOB),
IF(DATETIME_DIFF(ADMITTIME, DOB, DAY)/365.25 < 200,
DATETIME_DIFF(ADMITTIME, DOB, DAY)/365.25, 95) AS age,
FROM
(SELECT SUBJECT_ID,HADM_ID,ITEMID FROM `{labs_event}`)
JOIN
(SELECT SUBJECT_ID,HADM_ID,ADMISSION_TYPE,ADMISSION_LOCATION,INSURANCE,MARITAL_STATUS,ETHNICITY ,ADMITTIME,HOSPITAL_EXPIRE_FLAG AS died FROM `{admissions_table}`)
USING (SUBJECT_ID,HADM_ID)
JOIN
(SELECT DOB,GENDER,SUBJECT_ID FROM`{patients_table}`)
USING (SUBJECT_ID)
JOIN labs
USING (HADM_ID)
)
JOIN
(SELECT
HADM_ID,
COUNT(*) AS num_diag,
{diag_cols}
FROM `{diagnoses_icd_table}`
WHERE ICD9_CODE IS NOT NULL
GROUP BY HADM_ID)
USING (HADM_ID)
""".format(labs_columns=',\n '.join(labs_columns),diag_cols=',\n '.join(diagnosis_columns),**sub_dict)
query_jobs.append(bq.query(query))
# Block until the training job finishes; exception() waits and
# returns (rather than raises) any job error.
for j in query_jobs:
j.exception()
Evaluation
# Evaluate the combined diagnoses + labs model.
eval_query = ('SELECT * FROM ML.EVALUATE('
              'MODEL `{ml_table_prefix}predict_mortality_diag_labs`)'
              ).format(**sub_dict)
bq.query(eval_query).result().to_dataframe()
Plot precision recall curve to compare model with 128 Top Lab events+256 Top Diagnoses to model with 128 Top Lab events
# Compare PR curves: diagnoses+labs model vs the top-128 and top-256
# labs-only models.
# NOTE: the original passed a stale loop variable `m` as a positional
# .format() argument to strings that have no positional placeholder;
# it only worked if an earlier loop cell had been run. Dropped here.
df1 = bq.query('SELECT * FROM ML.ROC_CURVE('
               'MODEL `{ml_table_prefix}predict_mortality_diag_labs`)'
               .format(**sub_dict)).result().to_dataframe()
df2 = bq.query('SELECT * FROM ML.ROC_CURVE('
               'MODEL `{ml_table_prefix}mortality_feature_top_labs_128`)'
               .format(**sub_dict)).result().to_dataframe()
df3 = bq.query('SELECT * FROM ML.ROC_CURVE('
               'MODEL `{ml_table_prefix}mortality_feature_top_labs_256`)'
               .format(**sub_dict)).result().to_dataframe()
set_precision(df1)
plot_precision_recall(df1, label='256 diagnoses + 128 lab event')
set_precision(df2)
plot_precision_recall(df2, label='128 lab event')
set_precision(df3)
plot_precision_recall(df3, label='256 lab event')
plt.legend()
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.xlabel(r'Recall $\left(\frac{T_p}{T_p + F_n} \right)$')
plt.ylabel(r'Precision $\left(\frac{T_p}{T_p + F_p} \right)$')
As we can see from the graph, adding the top 256 diagnoses actually improves our model.
Let's look at the model weights.
%%bigquery weights_128_256
# Learned weights of the combined diagnoses + labs model, largest first.
SELECT * FROM ML.WEIGHTS(MODEL `{ml_table_prefix}predict_mortality_diag_labs`)
ORDER BY weight DESC
# Display the first rows of the weights table.
weights_128_256.head()
First we'll look at the weights for the numerical inputs.
def _weight_key(processed_input):
    # 'icd9_<CODE>' -> the ICD-9 code string, 'lab_<ITEMID>' -> int
    # ITEMID; anything else passes through unchanged. These keys join
    # against top_diagnoses and top_labs respectively.
    if processed_input.startswith('icd9_'):
        return processed_input[len('icd9_'):]
    if processed_input.startswith('lab_'):
        return int(processed_input.split('_')[1])
    return processed_input

weights_128_256['ITEMID'] = weights_128_256.processed_input.apply(_weight_key)
# Rename the diagnosis columns so both lookup tables share a schema.
top_diagnoses1 = top_diagnoses.rename(
    columns={'ICD9_CODE': 'ITEMID', 'SHORT_TITLE': 'LABEL', 'count': 'COUNT'})
dff = pd.concat([top_labs, top_diagnoses1])
view_df = weights_128_256.merge(dff, how='left', on='ITEMID')
view_df = view_df[~pd.isnull(view_df.weight)]
view_df[['processed_input', 'LABEL', 'weight', 'COUNT']]
We now have a list of diagnoses and lab events, sorted from most fatal to least fatal according to our model.
Going back to our original question, we can see that the weight for num_diag (a.k.a the number of diagnoses) has essentially gone to zero (0.002395) as well as the weight for COUNT_LABS (a.k.a the number of lab events) is very small (-0.000011). The average diagnoses weight and lab event is also very small:
# Mean weight over the diagnosis and lab-event features.
view_df[~pd.isnull(view_df.LABEL)].weight.mean()
Nonetheless, the graph shows that the first 20 lines are occupied by diagnoses, namely they have more weight. So we can conclude that diagnoses provide a better prediction than a lab event
Social determinant weights
# Per-category weights of each categorical (social-determinant) input;
# categorical rows carry a NULL top-level weight in ML.WEIGHTS output.
categorical_rows = weights_128_256[pd.isnull(weights_128_256['weight'])]
for _, row in categorical_rows.iterrows():
    print(row['processed_input'])
    for cw in row['category_weights']:
        print(cw['category'], cw['weight'])
    print("____________\n")
This time we will make a prediction based on the lab events done in the first 24 hours of admission
Example table for train
%%bigquery
# Example (LIMIT 50) of the 24-hour training table: only lab events
# charted within 24 hours of admission are kept and counted.
WITH summary AS (
SELECT * FROM
(SELECT SUBJECT_ID,HADM_ID,ADMISSION_TYPE,ADMISSION_LOCATION,INSURANCE,
# NOTE(review): the label is named dead here but died in the training
# cell below - confirm which is intended.
MARITAL_STATUS,ETHNICITY,GENDER,HOSPITAL_EXPIRE_FLAG as dead,
# 1 if the lab event was charted within 24 hours of admission.
IF(DATETIME_DIFF(CHARTTIME, ADMITTIME, HOUR)<= 24,
1, 0) AS TIME,
# Age in years, clamped to 95 for implausible (200+) values.
IF(DATETIME_DIFF(ADMITTIME, DOB, DAY)/365.25 < 200,
DATETIME_DIFF(ADMITTIME, DOB, DAY)/365.25, 95) AS AGE,
FROM
(
`{admissions_table}` JOIN
`{labs_event}`
USING (HADM_ID,SUBJECT_ID)
JOIN
`{patients_table}`
USING (SUBJECT_ID)))
WHERE TIME=1)
SELECT *EXCEPT(HADM_ID,SUBJECT_ID,TIME) FROM
summary
JOIN
# Per-admission count of the first-24-hour lab events.
(SELECT count(*) as Num_labs,HADM_ID
From summary
GROUP BY HADM_ID)
USING (HADM_ID)
LIMIT 50
%%bigquery
# Train the 24-hour model: demographics plus the number of lab events
# charted within the first 24 hours of admission.
CREATE OR REPLACE MODEL `{ml_table_prefix}mortality_24_hours`
OPTIONS(
model_type = 'logistic_reg',
# See the below aside (𝜎 = 0.5 ⇒ 𝜆 = 2)
l2_reg = 2,
input_label_cols = ["died"]
)
AS
# standard SQL query to train the model with:
WITH summary AS (
SELECT * FROM
(
SELECT SUBJECT_ID,HADM_ID,ADMISSION_TYPE,ADMISSION_LOCATION,INSURANCE,
MARITAL_STATUS,ETHNICITY,GENDER,HOSPITAL_EXPIRE_FLAG as died,
# 1 if the lab event was charted within 24 hours of admission.
IF(DATETIME_DIFF(CHARTTIME, ADMITTIME, HOUR)<= 24,
1, 0) AS TIME,
# Age in years, clamped to 95 for implausible (200+) values.
IF(DATETIME_DIFF(ADMITTIME, DOB, DAY)/365.25 < 200,
DATETIME_DIFF(ADMITTIME, DOB, DAY)/365.25, 95) AS AGE,
FROM
(
`{admissions_table}` JOIN
`{labs_event}`
USING (HADM_ID,SUBJECT_ID)
JOIN
`{patients_table}`
USING (SUBJECT_ID))
)
WHERE TIME=1)
SELECT *EXCEPT(HADM_ID,SUBJECT_ID,TIME) FROM
summary
JOIN
# Per-admission count of the first-24-hour lab events.
(SELECT count(*) as Num_labs,HADM_ID
From summary
GROUP BY HADM_ID)
USING (HADM_ID)
# Evaluate the 24-hour model.
eval_query = ('SELECT * FROM ML.EVALUATE('
              'MODEL `{ml_table_prefix}mortality_24_hours`)'
              ).format(**sub_dict)
bq.query(eval_query).result().to_dataframe()
%%bigquery weights_24
# Learned weights of the 24-hour model, largest first.
SELECT * FROM ML.WEIGHTS(MODEL `{ml_table_prefix}mortality_24_hours`)
ORDER BY weight DESC
# Display the weights table.
weights_24
We see that the Num_labs weight really increased, from 0.000508 to 0.006802. But how much will this affect predictive power?
# Compare the 24-hour labs model against the earlier models.
# NOTE: the original passed a stale loop variable `m` as a positional
# .format() argument to strings with no positional placeholder; it
# only worked if an earlier loop cell had been run. Dropped here.
def _roc_df(model_name):
    # Fetch one model's ROC curve as a dataframe.
    query = ('SELECT * FROM ML.ROC_CURVE('
             'MODEL `{ml_table_prefix}' + model_name + '`)').format(**sub_dict)
    return bq.query(query).result().to_dataframe()

curves = [
    ('mortality_24_hours', 'lab event in the first 24 hours'),
    ('complexity_feature_mortality', 'all numbers of lab events'),
    ('predict_mortality_diag_labs', '256 diagnoses + 128 lab event'),
    ('mortality_feature_top_labs_128', 'top 128 lab event'),
]
for model_name, curve_label in curves:
    df = _roc_df(model_name)
    set_precision(df)
    plot_precision_recall(df, label=curve_label)
plt.legend()
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.xlabel(r'Recall $\left(\frac{T_p}{T_p + F_n} \right)$')
plt.ylabel(r'Precision $\left(\frac{T_p}{T_p + F_p} \right)$')
We see that considering only the LAB EVENTS in the first 24 hours allows us to improve our BASIC model (only LAB EVENTS). As further work, we can consider the one hot encoding of top diagnoses and top lab events assigned also in the first 24 hours. Nevertheless, we see that the use of diagnoses and labs event allows us to create a serious model (ROC 0.88).